# Fix the RNG seed so that the randomly initialised steps below are
# reproducible from run to run.
set.seed(1)
# Attach dependencies with library(): a missing package then stops the script
# right here, whereas require() only warns, returns FALSE and lets the script
# fail later at the first call into that package.
library(cluster)
library(clValid)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
library(xlsx)
library(ggplot2)
Skip to Measuring Clustering Effectiveness
Skip to Boxplots of Cluster (by Feature)
# Load the slide-stain statistics table.
x <- read.csv(file = 'ProjectStatistics_UNCSlideStainChar.csv')
# Keep an untouched copy; the cluster labels are appended to it for export.
Final <- x
colnames(x)
## [1] "scene_name"
## [2] "scene_id"
## [3] "scene_ver"
## [4] "th_BG"
## [5] "Layer.mean.of.Layer.1..unclassified"
## [6] "Layer.mean.of.Layer.2..unclassified"
## [7] "Layer.mean.of.Layer.3..unclassified"
## [8] "th_RL1"
## [9] "th_RL2"
## [10] "th_RL3"
# Drop the first three columns (scene_name, scene_id, scene_ver) so that only
# the seven feature columns remain for clustering.
x <- x[, -(1:3)]
# Column-wise centred and scaled copy of the features.
x.s <- scale(x)
# Cluster assignments on the scaled data (x.s) for k = 2, 3, ..., 12.
# Each *.S object is a matrix with one row per observation and one column per
# value of k, in increasing order of k.
# The per-k label vectors are collected with lapply() and bound once, instead
# of growing a matrix with cbind() on every loop iteration; this also avoids
# creating a variable called `pam` that shadows cluster::pam().

# hierarchical
hier.s <- hclust(dist(x.s))
HIER.S <- do.call(
  cbind,
  lapply(2:12, function(k) cutree(hier.s, k = k))
)
# kmeans (run in increasing k so the RNG stream is consumed in the same order)
KMEANS.S <- do.call(
  cbind,
  lapply(2:12, function(k) kmeans(x.s, centers = k)$cluster)
)
# diana
dia.s <- diana(x.s)
DIA.S <- do.call(
  cbind,
  lapply(2:12, function(k) cutree(dia.s, k))
)
# pam
PAM.S <- do.call(
  cbind,
  lapply(2:12, function(k) pam(x.s, k, cluster.only = TRUE))
)
# Cluster assignments on the unscaled data (x) for k = 2, 3, ..., 12.
# Each *.U object is a matrix with one row per observation and one column per
# value of k, in increasing order of k.
# The per-k label vectors are collected with lapply() and bound once, instead
# of growing a matrix with cbind() on every loop iteration; this also avoids
# creating a variable called `pam` that shadows cluster::pam().

# hierarchical
hier.u <- hclust(dist(x))
HIER.U <- do.call(
  cbind,
  lapply(2:12, function(k) cutree(hier.u, k = k))
)
# kmeans (run in increasing k so the RNG stream is consumed in the same order)
KMEANS.U <- do.call(
  cbind,
  lapply(2:12, function(k) kmeans(x, centers = k)$cluster)
)
# diana
dia.u <- diana(x)
DIA.U <- do.call(
  cbind,
  lapply(2:12, function(k) cutree(dia.u, k))
)
# pam
PAM.U <- do.call(
  cbind,
  lapply(2:12, function(k) pam(x, k, cluster.only = TRUE))
)
Several different validation measures are used. For every metric except Dunn and Silhouette, a lower value indicates more stable clustering; for Dunn and Silhouette, a higher value indicates more stable clustering. Unfortunately, all graphs suggest either two or twelve clusters, which is not reflected in the research.
# Validate the four algorithms for k = 2..12 with clValid, requesting both the
# internal and the stability measures, once per data set / distance metric.
# NOTE(review): no `method` argument is passed to clValid() here and none to
# hclust() where the exported hierarchical labels are computed -- confirm that
# the two package defaults use the same linkage.

# Scaled data, Euclidean distance
Euc.s <- clValid(
  x.s, 2:12,
  clMethods = c("hierarchical", "kmeans", "diana", "pam"),
  validation = c("internal", "stability"),
  maxitems = 800,
  metric = "euclidean"
)
## Warning in clValid(x.s, 2:12, clMethods = c("hierarchical", "kmeans",
## "diana", : rownames for data not specified, using 1:nrow(data)
plot(Euc.s)
# Scaled data, correlation distance
Cor.s <- clValid(
  x.s, 2:12,
  clMethods = c("hierarchical", "kmeans", "diana", "pam"),
  validation = c("internal", "stability"),
  maxitems = 800,
  metric = "correlation"
)
## Warning in clValid(x.s, 2:12, clMethods = c("hierarchical", "kmeans",
## "diana", : rownames for data not specified, using 1:nrow(data)
plot(Cor.s)
# Unscaled data, Euclidean distance
Euc <- clValid(
  x, 2:12,
  clMethods = c("hierarchical", "kmeans", "diana", "pam"),
  validation = c("internal", "stability"),
  maxitems = 800,
  metric = "euclidean"
)
## Warning in clValid(x, 2:12, clMethods = c("hierarchical", "kmeans",
## "diana", : rownames for data not specified, using 1:nrow(data)
plot(Euc)
# Unscaled data, correlation distance
Cor <- clValid(
  x, 2:12,
  clMethods = c("hierarchical", "kmeans", "diana", "pam"),
  validation = c("internal", "stability"),
  maxitems = 800,
  metric = "correlation"
)
## Warning in clValid(x, 2:12, clMethods = c("hierarchical", "kmeans",
## "diana", : rownames for data not specified, using 1:nrow(data)
plot(Cor)
# Name the eleven columns of every assignment matrix after the number of
# clusters (two .. twelve) that produced them.
colnames(HIER.S) <- c("two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve")
colnames(HIER.U) <- colnames(HIER.S)
colnames(KMEANS.S) <- colnames(HIER.S)
colnames(KMEANS.U) <- colnames(HIER.S)
colnames(DIA.S) <- colnames(HIER.S)
colnames(DIA.U) <- colnames(HIER.S)
colnames(PAM.S) <- colnames(HIER.S)
colnames(PAM.U) <- colnames(HIER.S)

# Append each set of labels to the untouched copy of the input table.
# Hierarchical
Final_Hier.s <- cbind(Final, HIER.S)
Final_Hier.u <- cbind(Final, HIER.U)
# K-means
Final_Kmeans.s <- cbind(Final, KMEANS.S)
Final_Kmeans.u <- cbind(Final, KMEANS.U)
# Diana
Final_Dia.s <- cbind(Final, DIA.S)
Final_Dia.u <- cbind(Final, DIA.U)
# Pam
Final_Pam.s <- cbind(Final, PAM.S)
Final_Pam.u <- cbind(Final, PAM.U)
# Export every labelled table to its own workbook. Each write is wrapped in
# try() so that one failed export does not abort the remaining ones.

# Scaled
try(write.xlsx(
  x = Final_Hier.s,
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Hier_Scaled.xlsx"
))
try(write.xlsx(
  x = Final_Kmeans.s,
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Kmeans_Scaled.xlsx"
))
try(write.xlsx(
  x = Final_Dia.s,
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Dia_Scaled.xlsx"
))
try(write.xlsx(
  x = Final_Pam.s,
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Pam_Scaled.xlsx"
))
# Unscaled
try(write.xlsx(
  x = Final_Hier.u,
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Hier_Unscaled.xlsx"
))
try(write.xlsx(
  x = Final_Kmeans.u,
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Kmeans_Unscaled.xlsx"
))
try(write.xlsx(
  x = Final_Dia.u,
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Dia_Unscaled.xlsx"
))
try(write.xlsx(
  x = Final_Pam.u,
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Pam_Unscaled.xlsx"
))
# Read the exported workbooks back in (first sheet of each file).
# NOTE(review): Box_by_Cluster indexes these frames up to column 22, while the
# 10 input columns plus 11 label columns only make 21 -- presumably the
# round trip adds a leading row-name column; confirm before changing the
# export/import step.

# Scaled
Hier_s <- read.xlsx(
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Hier_Scaled.xlsx",
  sheetIndex = 1
)
Kmeans_s <- read.xlsx(
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Kmeans_Scaled.xlsx",
  sheetIndex = 1
)
Diana_s <- read.xlsx(
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Dia_Scaled.xlsx",
  sheetIndex = 1
)
Pam_s <- read.xlsx(
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Pam_Scaled.xlsx",
  sheetIndex = 1
)
# Unscaled
Hier_u <- read.xlsx(
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Hier_Unscaled.xlsx",
  sheetIndex = 1
)
Kmeans_u <- read.xlsx(
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Kmeans_Unscaled.xlsx",
  sheetIndex = 1
)
Diana_u <- read.xlsx(
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Dia_Unscaled.xlsx",
  sheetIndex = 1
)
Pam_u <- read.xlsx(
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Pam_Unscaled.xlsx",
  sheetIndex = 1
)
# Print one box plot per feature column of `x`, grouped by a cluster column.
#
# x        Data frame; columns 5-11 are plotted as features and columns
#          12-22 are treated as cluster assignments.
# Cluster  Name (string) of the cluster-assignment column for the x axis.
# Cl.type  Optional text inserted into every plot title.
#
# Called for its side effect of printing the plots.
Box_by_Cluster <- function(x, Cluster, Cl.type = NULL){
  # Convert every cluster-assignment column to a factor so that ggplot
  # treats the labels as discrete groups.
  assignment_cols <- 12:22
  x[assignment_cols] <- lapply(x[assignment_cols], as.factor)
  # The title is the same for all features, so build it once.
  plot_title <- paste("Box Plot of",Cl.type,"Clustering")
  # One box plot per feature column.
  for(feature in colnames(x)[5:11]){
    box_plot <- ggplot(x, aes_string(x = Cluster, y = feature)) +
      geom_boxplot() +
      labs(title = plot_title, x = "Cluster")
    print(box_plot)
  }
}
# Run Box_by_Cluster once for every cluster-count column ("two" .. "twelve")
# of `x`, printing a short header line before each batch of plots.
#
# x         Data frame passed through to Box_by_Cluster.
# Cl.type2  Optional text forwarded as Box_by_Cluster's `Cl.type`.
Box_all_clusters <- function(x, Cl.type2 = NULL){
  cluster_columns <- c("two","three","four","five","six","seven","eight","nine","ten","eleven","twelve")
  for(column_name in cluster_columns){
    print(paste("Summary for",column_name,"clusters."))
    Box_by_Cluster(x, column_name, Cl.type = Cl.type2)
  }
}
# Produce the per-feature box plots for every algorithm, first on the scaled
# and then on the unscaled results. The second argument only labels the plot
# titles. Lines starting with `##` are the console output of each call.
Box_all_clusters(Hier_s, "Hierarchical Scaled")
## [1] "Summary for two clusters."
## [1] "Summary for three clusters."
## [1] "Summary for four clusters."
## [1] "Summary for five clusters."
## [1] "Summary for six clusters."
## [1] "Summary for seven clusters."
## [1] "Summary for eight clusters."
## [1] "Summary for nine clusters."
## [1] "Summary for ten clusters."
## [1] "Summary for eleven clusters."
## [1] "Summary for twelve clusters."
Box_all_clusters(Kmeans_s, "K-Means Scaled")
## [1] "Summary for two clusters."
## [1] "Summary for three clusters."
## [1] "Summary for four clusters."
## [1] "Summary for five clusters."
## [1] "Summary for six clusters."
## [1] "Summary for seven clusters."
## [1] "Summary for eight clusters."
## [1] "Summary for nine clusters."
## [1] "Summary for ten clusters."
## [1] "Summary for eleven clusters."
## [1] "Summary for twelve clusters."
Box_all_clusters(Diana_s, "Diana Scaled")
## [1] "Summary for two clusters."
## [1] "Summary for three clusters."
## [1] "Summary for four clusters."
## [1] "Summary for five clusters."
## [1] "Summary for six clusters."
## [1] "Summary for seven clusters."
## [1] "Summary for eight clusters."
## [1] "Summary for nine clusters."
## [1] "Summary for ten clusters."
## [1] "Summary for eleven clusters."
## [1] "Summary for twelve clusters."
Box_all_clusters(Pam_s, "Pam Scaled")
## [1] "Summary for two clusters."
## [1] "Summary for three clusters."
## [1] "Summary for four clusters."
## [1] "Summary for five clusters."
## [1] "Summary for six clusters."
## [1] "Summary for seven clusters."
## [1] "Summary for eight clusters."
## [1] "Summary for nine clusters."
## [1] "Summary for ten clusters."
## [1] "Summary for eleven clusters."
## [1] "Summary for twelve clusters."
Box_all_clusters(Hier_u, "Hierarchical Unscaled")
## [1] "Summary for two clusters."
## [1] "Summary for three clusters."
## [1] "Summary for four clusters."
## [1] "Summary for five clusters."
## [1] "Summary for six clusters."
## [1] "Summary for seven clusters."
## [1] "Summary for eight clusters."
## [1] "Summary for nine clusters."
## [1] "Summary for ten clusters."
## [1] "Summary for eleven clusters."
## [1] "Summary for twelve clusters."
Box_all_clusters(Kmeans_u, "K-Means Unscaled")
## [1] "Summary for two clusters."
## [1] "Summary for three clusters."
## [1] "Summary for four clusters."
## [1] "Summary for five clusters."
## [1] "Summary for six clusters."
## [1] "Summary for seven clusters."
## [1] "Summary for eight clusters."
## [1] "Summary for nine clusters."
## [1] "Summary for ten clusters."
## [1] "Summary for eleven clusters."
## [1] "Summary for twelve clusters."
Box_all_clusters(Diana_u, "Diana Unscaled")
## [1] "Summary for two clusters."
## [1] "Summary for three clusters."
## [1] "Summary for four clusters."
## [1] "Summary for five clusters."
## [1] "Summary for six clusters."
## [1] "Summary for seven clusters."
## [1] "Summary for eight clusters."
## [1] "Summary for nine clusters."
## [1] "Summary for ten clusters."
## [1] "Summary for eleven clusters."
## [1] "Summary for twelve clusters."
Box_all_clusters(Pam_u, "Pam Unscaled")
## [1] "Summary for two clusters."
## [1] "Summary for three clusters."
## [1] "Summary for four clusters."
## [1] "Summary for five clusters."
## [1] "Summary for six clusters."
## [1] "Summary for seven clusters."
## [1] "Summary for eight clusters."
## [1] "Summary for nine clusters."
## [1] "Summary for ten clusters."
## [1] "Summary for eleven clusters."
## [1] "Summary for twelve clusters."